/*
 *  Copyright 2019, dqsjqian(Mr.Zhang).  All rights reserved.
 *
 *  THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF dqsjqian(Mr.Zhang).
 *  LTD.  THE CONTENTS OF THIS FILE MAY NOT BE DISCLOSED TO THIRD
 *  PARTIES, COPIED OR DUPLICATED IN ANY FORM, IN WHOLE OR IN PART,
 *  WITHOUT THE PRIOR WRITTEN PERMISSION OF dqsjqian(Mr.Zhang).
 *
 *
 *  Edit History:
 *
 *    2019-08-28 - Created by dqsjqian(Mr.Zhang) dqsjqian@163.com
 *
 */


 /* *************************************
 *  Tuning parameters
 ***************************************/
 /*!CQH_FORCE_MEMORY_ACCESS :
  * By default, access to unaligned memory is performed through `memcpy()`, which is safe and portable.
  * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
  * The switch below allows selecting a different access method for improved performance.
  * Method 0 (default) : use `memcpy()`. Safe and portable.
  * Method 1 : `__packed` statement. It depends on a compiler extension (hence, not portable).
  *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
  * Method 2 : direct access. This method doesn't depend on the compiler, but it violates the C standard.
  *            It can generate buggy code on targets which do not support unaligned memory accesses.
  *            However, in some circumstances, it's the only known way to get the best performance (e.g. GCC + ARMv6).
  * Prefer these methods in priority order (0 > 1 > 2).
  */


#ifndef CQH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
                        || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
                        || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
#    define CQH_FORCE_MEMORY_ACCESS 2
#  elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
                    || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
                    || defined(__ARM_ARCH_7S__) ))
#    define CQH_FORCE_MEMORY_ACCESS 1
#  endif
#endif
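
/* Illustrative only (not part of the library) : as noted above, the access method can be
 * forced from the build command line rather than edited here, e.g.
 *     cc -DCQH_FORCE_MEMORY_ACCESS=1 -c cqhash.c
 * (compiler and file names are placeholders; any of the values 0, 1 or 2 selects the
 * corresponding method described above). */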

  /*!CQH_ACCEPT_NULL_INPUT_POINTER :
   * If the input pointer is NULL, CQHash's default behavior is to dereference it, triggering a segfault.
   * When this macro is enabled, CQHash actively checks the input for a null pointer.
   * If it is NULL, the result is the same as for a zero-length input.
   */
#ifndef CQH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
#  define CQH_ACCEPT_NULL_INPUT_POINTER 0
#endif

   /*!CQH_FORCE_ALIGN_CHECK :
	* This is a minor performance trick, only useful with lots of very small keys.
	* It means : check for aligned/unaligned input.
	* The check costs one initial branch per hash;
	* set it to 0 when the input is guaranteed to be aligned,
	* or when alignment doesn't matter for performance.
	*/
#ifndef CQH_FORCE_ALIGN_CHECK /* can be defined externally */
#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
#    define CQH_FORCE_ALIGN_CHECK 0
#  else
#    define CQH_FORCE_ALIGN_CHECK 1
#  endif
#endif


	/* *************************************
	*  Includes & Memory related functions
	***************************************/
	/*! Modify the local functions below should you wish to use some other memory routines
	*   for malloc(), free() */
#include <stdlib.h>
static void* CQH_malloc(size_t s) { return malloc(s); }
static void  CQH_free(void* p) { free(p); }
/*! and for memcpy() */
#include <string.h>
static void* CQH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest, src, size); }

#include <assert.h>   /* assert */

#define CQH_STATIC_LINKING_ONLY
#include "cqhash.h"


/* *************************************
*  Compiler Specific Options
***************************************/
#ifdef _MSC_VER    /* Visual Studio */
#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
#  define CQH_FORCE_INLINE static __forceinline
#  define CQH_NO_INLINE static __declspec(noinline)
#else
#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
#    ifdef __GNUC__
#      define CQH_FORCE_INLINE static inline __attribute__((always_inline))
#      define CQH_NO_INLINE static __attribute__((noinline))
#    else
#      define CQH_FORCE_INLINE static inline
#      define CQH_NO_INLINE static
#    endif
#  else
#    define CQH_FORCE_INLINE static
#    define CQH_NO_INLINE static
#  endif /* __STDC_VERSION__ */
#endif


/* *************************************
*  Basic Types
***************************************/
#ifndef MEM_MODULE
# if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   include <stdint.h>
typedef uint8_t  BYTE;
typedef uint16_t U16;
typedef uint32_t U32;
# else
typedef unsigned char      BYTE;
typedef unsigned short     U16;
typedef unsigned int       U32;
# endif
#endif


/* ===   Memory access   === */

#if (defined(CQH_FORCE_MEMORY_ACCESS) && (CQH_FORCE_MEMORY_ACCESS==2))

/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
static U32 CQH_read32(const void* memPtr) { return *(const U32*)memPtr; }

#elif (defined(CQH_FORCE_MEMORY_ACCESS) && (CQH_FORCE_MEMORY_ACCESS==1))

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U32 u32; } __attribute__((packed)) unalign;
static U32 CQH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }

#else

/* portable and safe solution. Generally efficient.
 * see : http://stackoverflow.com/a/32095106/646947
 */
static U32 CQH_read32(const void* memPtr)
{
	U32 val;
	memcpy(&val, memPtr, sizeof(val));
	return val;
}

#endif   /* CQH_FORCE_MEMORY_ACCESS */


/* ===   Endianness   === */
typedef enum { CQH_bigEndian = 0, CQH_littleEndian = 1 } CQH_endianess;

/* CQH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
#ifndef CQH_CPU_LITTLE_ENDIAN
static int CQH_isLittleEndian(void)
{
	const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
	return one.c[0];
}
#   define CQH_CPU_LITTLE_ENDIAN   CQH_isLittleEndian()
#endif
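
/* Illustrative only (not part of the library) : when the target endianness is known at build
 * time, the runtime check above can be skipped by defining the macro externally, e.g.
 *     -DCQH_CPU_LITTLE_ENDIAN=1
 * (or 0 for a big-endian target). */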




/* ****************************************
*  Compiler-specific Functions and Macros
******************************************/
#define CQH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)

/* Note : although _rotl exists for MinGW (GCC under Windows), performance seems poor */
#if defined(_MSC_VER)
#  define CQH_rotl32(x,r) _rotl(x,r)
#  define CQH_rotl64(x,r) _rotl64(x,r)
#else
#  define CQH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#  define CQH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

#if defined(_MSC_VER)     /* Visual Studio */
#  define CQH_swap32 _byteswap_ulong
#elif CQH_GCC_VERSION >= 403
#  define CQH_swap32 __builtin_bswap32
#else
static U32 CQH_swap32(U32 x)
{
	return  ((x << 24) & 0xff000000) |
		((x << 8) & 0x00ff0000) |
		((x >> 8) & 0x0000ff00) |
		((x >> 24) & 0x000000ff);
}
#endif


/* ***************************
*  Memory reads
*****************************/
typedef enum { CQH_aligned, CQH_unaligned } CQH_alignment;

CQH_FORCE_INLINE U32 CQH_readLE32(const void* ptr)
{
	return CQH_CPU_LITTLE_ENDIAN ? CQH_read32(ptr) : CQH_swap32(CQH_read32(ptr));
}

static U32 CQH_readBE32(const void* ptr)
{
	return CQH_CPU_LITTLE_ENDIAN ? CQH_swap32(CQH_read32(ptr)) : CQH_read32(ptr);
}

CQH_FORCE_INLINE U32
CQH_readLE32_align(const void* ptr, CQH_alignment align)
{
	if (align == CQH_unaligned) {
		return CQH_readLE32(ptr);
	}
	else {
		return CQH_CPU_LITTLE_ENDIAN ? *(const U32*)ptr : CQH_swap32(*(const U32*)ptr);
	}
}


/* *************************************
*  Macros
***************************************/
#define CQH_STATIC_ASSERT(c)  { enum { CQH_sa = 1/(int)(!!(c)) }; }  /* use after variable declarations */
CQH_PUBLIC_API unsigned CQH_versionNumber(void) { return CQH_VERSION_NUMBER; }


/* *******************************************************************
*  32-bit hash functions
*********************************************************************/
static const U32 PRIME32_1 = 2654435761U;   /* 0b10011110001101110111100110110001 */
static const U32 PRIME32_2 = 2246822519U;   /* 0b10000101111010111100101001110111 */
static const U32 PRIME32_3 = 3266489917U;   /* 0b11000010101100101010111000111101 */
static const U32 PRIME32_4 = 668265263U;   /* 0b00100111110101001110101100101111 */
static const U32 PRIME32_5 = 374761393U;   /* 0b00010110010101100110011110110001 */

static U32 CQH32_round(U32 acc, U32 input)
{
	acc += input * PRIME32_2;
	acc = CQH_rotl32(acc, 13);
	acc *= PRIME32_1;
#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(CQH_ENABLE_AUTOVECTORIZE)
	/* UGLY HACK:
	 * This inline assembly hack forces acc into a normal register. This is the
	 * only thing that prevents GCC and Clang from autovectorizing the CQH32 loop
	 * (pragmas and attributes don't work for some reason) without globally
	 * disabling SSE4.1.
	 *
	 * The reason we want to avoid vectorization is because despite working on
	 * 4 integers at a time, there are multiple factors slowing CQH32 down on
	 * SSE4:
	 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on newer chips!)
	 *   making it slightly slower to multiply four integers at once compared to four
	 *   integers independently. Even when pmulld was fastest, Sandy/Ivy Bridge, it is
	 *   still not worth it to go into SSE just to multiply unless doing a long operation.
	 *
	 * - Four instructions are required to rotate,
	 *      movdqa tmp,  v // not required with VEX encoding
	 *      pslld  tmp, 13 // tmp <<= 13
	 *      psrld  v,   19 // x >>= 19
	 *      por    v,  tmp // x |= tmp
	 *   compared to one for scalar:
	 *      roll   v, 13    // reliably fast across the board
	 *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
	 *
	 * - Instruction level parallelism is actually more beneficial here because the
	 *   SIMD actually serializes this operation: While v1 is rotating, v2 can load data,
	 *   while v3 can multiply. SSE forces them to operate together.
	 *
	 * How this hack works:
	 * __asm__(""       // Declare an assembly block but don't declare any instructions
	 *          :       // However, as an Input/Output Operand,
	 *          "+r"    // constrain a read/write operand (+) as a general purpose register (r).
	 *          (acc)   // and set acc as the operand
	 * );
	 *
	 * Because of the 'r', the compiler has promised that acc will be in a
	 * general purpose register and the '+' says that it will be 'read/write',
	 * so it has to assume it has changed. It is like volatile without all the
	 * loads and stores.
	 *
	 * Since the argument has to be in a normal register (not an SSE register),
	 * each time CQH32_round is called, it is impossible to vectorize. */
	__asm__("" : "+r" (acc));
#endif
	return acc;
}

/* mix all bits */
static U32 CQH32_avalanche(U32 h32)
{
	h32 ^= h32 >> 15;
	h32 *= PRIME32_2;
	h32 ^= h32 >> 13;
	h32 *= PRIME32_3;
	h32 ^= h32 >> 16;
	return(h32);
}

#define CQH_get32bits(p) CQH_readLE32_align(p, align)

static U32
CQH32_finalize(U32 h32, const void* ptr, size_t len, CQH_alignment align)

{
	const BYTE* p = (const BYTE*)ptr;

#define PROCESS1               \
    h32 += (*p++) * PRIME32_5; \
    h32 = CQH_rotl32(h32, 11) * PRIME32_1 ;

#define PROCESS4                         \
    h32 += CQH_get32bits(p) * PRIME32_3; \
    p+=4;                                \
    h32  = CQH_rotl32(h32, 17) * PRIME32_4 ;

	switch (len & 15)  /* or switch(bEnd - p) */
	{
	case 12:      PROCESS4;
		/* fallthrough */
	case 8:       PROCESS4;
		/* fallthrough */
	case 4:       PROCESS4;
		return CQH32_avalanche(h32);

	case 13:      PROCESS4;
		/* fallthrough */
	case 9:       PROCESS4;
		/* fallthrough */
	case 5:       PROCESS4;
		PROCESS1;
		return CQH32_avalanche(h32);

	case 14:      PROCESS4;
		/* fallthrough */
	case 10:      PROCESS4;
		/* fallthrough */
	case 6:       PROCESS4;
		PROCESS1;
		PROCESS1;
		return CQH32_avalanche(h32);

	case 15:      PROCESS4;
		/* fallthrough */
	case 11:      PROCESS4;
		/* fallthrough */
	case 7:       PROCESS4;
		/* fallthrough */
	case 3:       PROCESS1;
		/* fallthrough */
	case 2:       PROCESS1;
		/* fallthrough */
	case 1:       PROCESS1;
		/* fallthrough */
	case 0:       return CQH32_avalanche(h32);
	}
	assert(0);
	return h32;   /* reaching this point is deemed impossible */
}

CQH_FORCE_INLINE U32
CQH32_endian_align(const void* input, size_t len, U32 seed, CQH_alignment align)
{
	const BYTE* p = (const BYTE*)input;
	const BYTE* bEnd = p + len;
	U32 h32;

#if defined(CQH_ACCEPT_NULL_INPUT_POINTER) && (CQH_ACCEPT_NULL_INPUT_POINTER>=1)
	if (p == NULL) {
		len = 0;
		bEnd = p = (const BYTE*)(size_t)16;
	}
#endif

	if (len >= 16) {
		const BYTE* const limit = bEnd - 15;
		U32 v1 = seed + PRIME32_1 + PRIME32_2;
		U32 v2 = seed + PRIME32_2;
		U32 v3 = seed + 0;
		U32 v4 = seed - PRIME32_1;

		do {
			v1 = CQH32_round(v1, CQH_get32bits(p)); p += 4;
			v2 = CQH32_round(v2, CQH_get32bits(p)); p += 4;
			v3 = CQH32_round(v3, CQH_get32bits(p)); p += 4;
			v4 = CQH32_round(v4, CQH_get32bits(p)); p += 4;
		} while (p < limit);

		h32 = CQH_rotl32(v1, 1) + CQH_rotl32(v2, 7)
			+ CQH_rotl32(v3, 12) + CQH_rotl32(v4, 18);
	}
	else {
		h32 = seed + PRIME32_5;
	}

	h32 += (U32)len;

	return CQH32_finalize(h32, p, len & 15, align);
}


CQH_PUBLIC_API unsigned int CQH32(const void* input, size_t len, unsigned int seed)
{
#if 0
	/* Simple version, good for code maintenance, but unfortunately slow for small inputs */
	CQH32_state_t state;
	CQH32_reset(&state, seed);
	CQH32_update(&state, input, len);
	return CQH32_digest(&state);

#else

	if (CQH_FORCE_ALIGN_CHECK) {
		if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
			return CQH32_endian_align(input, len, seed, CQH_aligned);
		}
	}

	return CQH32_endian_align(input, len, seed, CQH_unaligned);
#endif
}
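

/* Illustrative usage sketch (not part of the library) : one-shot 32-bit hashing of a memory
 * buffer. The buffer contents and seed value below are arbitrary examples. */
#if 0
#include <stdio.h>
static void CQH32_usage_example(void)
{
	const char data[] = "hello world";
	unsigned int const seed = 0;                          /* any seed value may be used */
	unsigned int const h = CQH32(data, sizeof(data) - 1, seed);
	printf("CQH32 = 0x%08X\n", h);
}
#endif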



/*======   Hash streaming   ======*/

CQH_PUBLIC_API CQH32_state_t* CQH32_createState(void)
{
	return (CQH32_state_t*)CQH_malloc(sizeof(CQH32_state_t));
}
CQH_PUBLIC_API CQH_errorcode CQH32_freeState(CQH32_state_t* statePtr)
{
	CQH_free(statePtr);
	return CQH_OK;
}

CQH_PUBLIC_API void CQH32_copyState(CQH32_state_t* dstState, const CQH32_state_t* srcState)
{
	memcpy(dstState, srcState, sizeof(*dstState));
}

CQH_PUBLIC_API CQH_errorcode CQH32_reset(CQH32_state_t* statePtr, unsigned int seed)
{
	CQH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
	memset(&state, 0, sizeof(state));
	state.v1 = seed + PRIME32_1 + PRIME32_2;
	state.v2 = seed + PRIME32_2;
	state.v3 = seed + 0;
	state.v4 = seed - PRIME32_1;
	/* do not write into reserved, planned to be removed in a future version */
	memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
	return CQH_OK;
}


CQH_PUBLIC_API CQH_errorcode
CQH32_update(CQH32_state_t* state, const void* input, size_t len)
{
	if (input == NULL)
#if defined(CQH_ACCEPT_NULL_INPUT_POINTER) && (CQH_ACCEPT_NULL_INPUT_POINTER>=1)
		return CQH_OK;
#else
		return CQH_ERROR;
#endif

	{   const BYTE* p = (const BYTE*)input;
	const BYTE* const bEnd = p + len;

	state->total_len_32 += (CQH32_hash_t)len;
	state->large_len |= (CQH32_hash_t)((len >= 16) | (state->total_len_32 >= 16));

	if (state->memsize + len < 16) {   /* fill in tmp buffer */
		CQH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
		state->memsize += (CQH32_hash_t)len;
		return CQH_OK;
	}

	if (state->memsize) {   /* some data left from previous update */
		CQH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16 - state->memsize);
		{   const U32* p32 = state->mem32;
		state->v1 = CQH32_round(state->v1, CQH_readLE32(p32)); p32++;
		state->v2 = CQH32_round(state->v2, CQH_readLE32(p32)); p32++;
		state->v3 = CQH32_round(state->v3, CQH_readLE32(p32)); p32++;
		state->v4 = CQH32_round(state->v4, CQH_readLE32(p32));
		}
		p += 16 - state->memsize;
		state->memsize = 0;
	}

	if (p <= bEnd - 16) {
		const BYTE* const limit = bEnd - 16;
		U32 v1 = state->v1;
		U32 v2 = state->v2;
		U32 v3 = state->v3;
		U32 v4 = state->v4;

		do {
			v1 = CQH32_round(v1, CQH_readLE32(p)); p += 4;
			v2 = CQH32_round(v2, CQH_readLE32(p)); p += 4;
			v3 = CQH32_round(v3, CQH_readLE32(p)); p += 4;
			v4 = CQH32_round(v4, CQH_readLE32(p)); p += 4;
		} while (p <= limit);

		state->v1 = v1;
		state->v2 = v2;
		state->v3 = v3;
		state->v4 = v4;
	}

	if (p < bEnd) {
		CQH_memcpy(state->mem32, p, (size_t)(bEnd - p));
		state->memsize = (unsigned)(bEnd - p);
	}
	}

	return CQH_OK;
}


CQH_PUBLIC_API unsigned int CQH32_digest(const CQH32_state_t* state)
{
	U32 h32;

	if (state->large_len) {
		h32 = CQH_rotl32(state->v1, 1)
			+ CQH_rotl32(state->v2, 7)
			+ CQH_rotl32(state->v3, 12)
			+ CQH_rotl32(state->v4, 18);
	}
	else {
		h32 = state->v3 /* == seed */ + PRIME32_5;
	}

	h32 += state->total_len_32;

	return CQH32_finalize(h32, state->mem32, state->memsize, CQH_aligned);
}
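

/* Illustrative usage sketch (not part of the library) : hashing data incrementally with the
 * streaming API. The chunk boundary is arbitrary; the result matches the one-shot CQH32(). */
#if 0
static unsigned int CQH32_streaming_example(const void* data, size_t len, unsigned int seed)
{
	unsigned int h;
	CQH32_state_t* const state = CQH32_createState();
	CQH32_reset(state, seed);
	/* feed the input in two chunks to illustrate incremental updates */
	CQH32_update(state, data, len / 2);
	CQH32_update(state, (const char*)data + len / 2, len - len / 2);
	h = CQH32_digest(state);
	CQH32_freeState(state);
	return h;
}
#endif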


/*======   Canonical representation   ======*/

/*! Default CQH result types are basic unsigned 32 and 64 bits.
*   The canonical representation follows the human-readable write convention, i.e. big-endian (most significant byte first).
*   These functions allow transformation of a hash result into and from its canonical format.
*   This way, hash values can be written into a file or buffer, and remain comparable across different systems.
*/

CQH_PUBLIC_API void CQH32_canonicalFromHash(CQH32_canonical_t* dst, CQH32_hash_t hash)
{
	CQH_STATIC_ASSERT(sizeof(CQH32_canonical_t) == sizeof(CQH32_hash_t));
	if (CQH_CPU_LITTLE_ENDIAN) hash = CQH_swap32(hash);
	memcpy(dst, &hash, sizeof(*dst));
}

CQH_PUBLIC_API CQH32_hash_t CQH32_hashFromCanonical(const CQH32_canonical_t* src)
{
	return CQH_readBE32(src);
}
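

/* Illustrative usage sketch (not part of the library) : converting a hash value to its
 * canonical (big-endian) byte representation and back, e.g. before writing it to a file. */
#if 0
static void CQH32_canonical_example(CQH32_hash_t hash)
{
	CQH32_canonical_t canonical;
	CQH32_canonicalFromHash(&canonical, hash);   /* byte order is now platform independent */
	assert(CQH32_hashFromCanonical(&canonical) == hash);
}
#endif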


#ifndef CQH_NO_LONG_LONG

/* *******************************************************************
*  64-bit hash functions
*********************************************************************/

/*======   Memory access   ======*/

#ifndef MEM_MODULE
# define MEM_MODULE
# if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   include <stdint.h>
typedef uint64_t U64;
# else
	/* if the compiler doesn't support unsigned long long, replace it with another 64-bit type */
typedef unsigned long long U64;
# endif
#endif


#if (defined(CQH_FORCE_MEMORY_ACCESS) && (CQH_FORCE_MEMORY_ACCESS==2))

/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
static U64 CQH_read64(const void* memPtr) { return *(const U64*)memPtr; }

#elif (defined(CQH_FORCE_MEMORY_ACCESS) && (CQH_FORCE_MEMORY_ACCESS==1))

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64;
static U64 CQH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }

#else

/* portable and safe solution. Generally efficient.
 * see : http://stackoverflow.com/a/32095106/646947
 */

static U64 CQH_read64(const void* memPtr)
{
	U64 val;
	memcpy(&val, memPtr, sizeof(val));
	return val;
}

#endif   /* CQH_FORCE_MEMORY_ACCESS */

#if defined(_MSC_VER)     /* Visual Studio */
#  define CQH_swap64 _byteswap_uint64
#elif CQH_GCC_VERSION >= 403
#  define CQH_swap64 __builtin_bswap64
#else
static U64 CQH_swap64(U64 x)
{
	return  ((x << 56) & 0xff00000000000000ULL) |
		((x << 40) & 0x00ff000000000000ULL) |
		((x << 24) & 0x0000ff0000000000ULL) |
		((x << 8) & 0x000000ff00000000ULL) |
		((x >> 8) & 0x00000000ff000000ULL) |
		((x >> 24) & 0x0000000000ff0000ULL) |
		((x >> 40) & 0x000000000000ff00ULL) |
		((x >> 56) & 0x00000000000000ffULL);
}
#endif

CQH_FORCE_INLINE U64 CQH_readLE64(const void* ptr)
{
	return CQH_CPU_LITTLE_ENDIAN ? CQH_read64(ptr) : CQH_swap64(CQH_read64(ptr));
}

static U64 CQH_readBE64(const void* ptr)
{
	return CQH_CPU_LITTLE_ENDIAN ? CQH_swap64(CQH_read64(ptr)) : CQH_read64(ptr);
}

CQH_FORCE_INLINE U64
CQH_readLE64_align(const void* ptr, CQH_alignment align)
{
	if (align == CQH_unaligned)
		return CQH_readLE64(ptr);
	else
		return CQH_CPU_LITTLE_ENDIAN ? *(const U64*)ptr : CQH_swap64(*(const U64*)ptr);
}


/*======   cqh64   ======*/

static const U64 PRIME64_1 = 11400714785074694791ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
static const U64 PRIME64_2 = 14029467366897019727ULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
static const U64 PRIME64_3 = 1609587929392839161ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
static const U64 PRIME64_4 = 9650029242287828579ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
static const U64 PRIME64_5 = 2870177450012600261ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */

static U64 CQH64_round(U64 acc, U64 input)
{
	acc += input * PRIME64_2;
	acc = CQH_rotl64(acc, 31);
	acc *= PRIME64_1;
	return acc;
}

static U64 CQH64_mergeRound(U64 acc, U64 val)
{
	val = CQH64_round(0, val);
	acc ^= val;
	acc = acc * PRIME64_1 + PRIME64_4;
	return acc;
}

static U64 CQH64_avalanche(U64 h64)
{
	h64 ^= h64 >> 33;
	h64 *= PRIME64_2;
	h64 ^= h64 >> 29;
	h64 *= PRIME64_3;
	h64 ^= h64 >> 32;
	return h64;
}


#define CQH_get64bits(p) CQH_readLE64_align(p, align)

static U64
CQH64_finalize(U64 h64, const void* ptr, size_t len, CQH_alignment align)
{
	const BYTE* p = (const BYTE*)ptr;

#define PROCESS1_64            \
    h64 ^= (*p++) * PRIME64_5; \
    h64 = CQH_rotl64(h64, 11) * PRIME64_1;

#define PROCESS4_64          \
    h64 ^= (U64)(CQH_get32bits(p)) * PRIME64_1; \
    p+=4;                    \
    h64 = CQH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;

#define PROCESS8_64 {        \
    U64 const k1 = CQH64_round(0, CQH_get64bits(p)); \
    p+=8;                    \
    h64 ^= k1;               \
    h64  = CQH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \
}

	switch (len & 31) {
	case 24: PROCESS8_64;
		/* fallthrough */
	case 16: PROCESS8_64;
		/* fallthrough */
	case  8: PROCESS8_64;
		return CQH64_avalanche(h64);

	case 28: PROCESS8_64;
		/* fallthrough */
	case 20: PROCESS8_64;
		/* fallthrough */
	case 12: PROCESS8_64;
		/* fallthrough */
	case  4: PROCESS4_64;
		return CQH64_avalanche(h64);

	case 25: PROCESS8_64;
		/* fallthrough */
	case 17: PROCESS8_64;
		/* fallthrough */
	case  9: PROCESS8_64;
		PROCESS1_64;
		return CQH64_avalanche(h64);

	case 29: PROCESS8_64;
		/* fallthrough */
	case 21: PROCESS8_64;
		/* fallthrough */
	case 13: PROCESS8_64;
		/* fallthrough */
	case  5: PROCESS4_64;
		PROCESS1_64;
		return CQH64_avalanche(h64);

	case 26: PROCESS8_64;
		/* fallthrough */
	case 18: PROCESS8_64;
		/* fallthrough */
	case 10: PROCESS8_64;
		PROCESS1_64;
		PROCESS1_64;
		return CQH64_avalanche(h64);

	case 30: PROCESS8_64;
		/* fallthrough */
	case 22: PROCESS8_64;
		/* fallthrough */
	case 14: PROCESS8_64;
		/* fallthrough */
	case  6: PROCESS4_64;
		PROCESS1_64;
		PROCESS1_64;
		return CQH64_avalanche(h64);

	case 27: PROCESS8_64;
		/* fallthrough */
	case 19: PROCESS8_64;
		/* fallthrough */
	case 11: PROCESS8_64;
		PROCESS1_64;
		PROCESS1_64;
		PROCESS1_64;
		return CQH64_avalanche(h64);

	case 31: PROCESS8_64;
		/* fallthrough */
	case 23: PROCESS8_64;
		/* fallthrough */
	case 15: PROCESS8_64;
		/* fallthrough */
	case  7: PROCESS4_64;
		/* fallthrough */
	case  3: PROCESS1_64;
		/* fallthrough */
	case  2: PROCESS1_64;
		/* fallthrough */
	case  1: PROCESS1_64;
		/* fallthrough */
	case  0: return CQH64_avalanche(h64);
	}

	/* impossible to reach */
	assert(0);
	return 0;  /* unreachable, but some compilers complain without it */
}

CQH_FORCE_INLINE U64
CQH64_endian_align(const void* input, size_t len, U64 seed, CQH_alignment align)
{
	const BYTE* p = (const BYTE*)input;
	const BYTE* bEnd = p + len;
	U64 h64;

#if defined(CQH_ACCEPT_NULL_INPUT_POINTER) && (CQH_ACCEPT_NULL_INPUT_POINTER>=1)
	if (p == NULL) {
		len = 0;
		bEnd = p = (const BYTE*)(size_t)32;
	}
#endif

	if (len >= 32) {
		const BYTE* const limit = bEnd - 32;
		U64 v1 = seed + PRIME64_1 + PRIME64_2;
		U64 v2 = seed + PRIME64_2;
		U64 v3 = seed + 0;
		U64 v4 = seed - PRIME64_1;

		do {
			v1 = CQH64_round(v1, CQH_get64bits(p)); p += 8;
			v2 = CQH64_round(v2, CQH_get64bits(p)); p += 8;
			v3 = CQH64_round(v3, CQH_get64bits(p)); p += 8;
			v4 = CQH64_round(v4, CQH_get64bits(p)); p += 8;
		} while (p <= limit);

		h64 = CQH_rotl64(v1, 1) + CQH_rotl64(v2, 7) + CQH_rotl64(v3, 12) + CQH_rotl64(v4, 18);
		h64 = CQH64_mergeRound(h64, v1);
		h64 = CQH64_mergeRound(h64, v2);
		h64 = CQH64_mergeRound(h64, v3);
		h64 = CQH64_mergeRound(h64, v4);

	}
	else {
		h64 = seed + PRIME64_5;
	}

	h64 += (U64)len;

	return CQH64_finalize(h64, p, len, align);
}


CQH_PUBLIC_API CQH64_hash_t CQH64(const void* input, size_t len, unsigned long long seed)
{
#if 0
	/* Simple version, good for code maintenance, but unfortunately slow for small inputs */
	CQH64_state_t state;
	CQH64_reset(&state, seed);
	CQH64_update(&state, input, len);
	return CQH64_digest(&state);

#else

	if (CQH_FORCE_ALIGN_CHECK) {
		if ((((size_t)input) & 7) == 0) {  /* Input is aligned, let's leverage the speed advantage */
			return CQH64_endian_align(input, len, seed, CQH_aligned);
		}
	}

	return CQH64_endian_align(input, len, seed, CQH_unaligned);

#endif
}
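
/* Illustrative usage sketch (not part of the library) : one-shot 64-bit hashing, mirroring
 * the 32-bit example above. The seed value is an arbitrary example. */
#if 0
static CQH64_hash_t CQH64_usage_example(const void* data, size_t len)
{
	unsigned long long const seed = 0;   /* any seed value may be used */
	return CQH64(data, len, seed);
}
#endif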

/*======   Hash Streaming   ======*/

CQH_PUBLIC_API CQH64_state_t* CQH64_createState(void)
{
	return (CQH64_state_t*)CQH_malloc(sizeof(CQH64_state_t));
}
CQH_PUBLIC_API CQH_errorcode CQH64_freeState(CQH64_state_t* statePtr)
{
	CQH_free(statePtr);
	return CQH_OK;
}

CQH_PUBLIC_API void CQH64_copyState(CQH64_state_t* dstState, const CQH64_state_t* srcState)
{
	memcpy(dstState, srcState, sizeof(*dstState));
}

CQH_PUBLIC_API CQH_errorcode CQH64_reset(CQH64_state_t* statePtr, unsigned long long seed)
{
	CQH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
	memset(&state, 0, sizeof(state));
	state.v1 = seed + PRIME64_1 + PRIME64_2;
	state.v2 = seed + PRIME64_2;
	state.v3 = seed + 0;
	state.v4 = seed - PRIME64_1;
	/* do not write into reserved, might be removed in a future version */
	memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
	return CQH_OK;
}

CQH_PUBLIC_API CQH_errorcode
CQH64_update(CQH64_state_t* state, const void* input, size_t len)
{
	if (input == NULL)
#if defined(CQH_ACCEPT_NULL_INPUT_POINTER) && (CQH_ACCEPT_NULL_INPUT_POINTER>=1)
		return CQH_OK;
#else
		return CQH_ERROR;
#endif

	{   const BYTE* p = (const BYTE*)input;
	const BYTE* const bEnd = p + len;

	state->total_len += len;

	if (state->memsize + len < 32) {  /* fill in tmp buffer */
		CQH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
		state->memsize += (U32)len;
		return CQH_OK;
	}

	if (state->memsize) {   /* tmp buffer is full */
		CQH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32 - state->memsize);
		state->v1 = CQH64_round(state->v1, CQH_readLE64(state->mem64 + 0));
		state->v2 = CQH64_round(state->v2, CQH_readLE64(state->mem64 + 1));
		state->v3 = CQH64_round(state->v3, CQH_readLE64(state->mem64 + 2));
		state->v4 = CQH64_round(state->v4, CQH_readLE64(state->mem64 + 3));
		p += 32 - state->memsize;
		state->memsize = 0;
	}

	if (p + 32 <= bEnd) {
		const BYTE* const limit = bEnd - 32;
		U64 v1 = state->v1;
		U64 v2 = state->v2;
		U64 v3 = state->v3;
		U64 v4 = state->v4;

		do {
			v1 = CQH64_round(v1, CQH_readLE64(p)); p += 8;
			v2 = CQH64_round(v2, CQH_readLE64(p)); p += 8;
			v3 = CQH64_round(v3, CQH_readLE64(p)); p += 8;
			v4 = CQH64_round(v4, CQH_readLE64(p)); p += 8;
		} while (p <= limit);

		state->v1 = v1;
		state->v2 = v2;
		state->v3 = v3;
		state->v4 = v4;
	}

	if (p < bEnd) {
		CQH_memcpy(state->mem64, p, (size_t)(bEnd - p));
		state->memsize = (unsigned)(bEnd - p);
	}
	}

	return CQH_OK;
}


CQH_PUBLIC_API CQH64_hash_t CQH64_digest(const CQH64_state_t* state)
{
	U64 h64;

	if (state->total_len >= 32) {
		U64 const v1 = state->v1;
		U64 const v2 = state->v2;
		U64 const v3 = state->v3;
		U64 const v4 = state->v4;

		h64 = CQH_rotl64(v1, 1) + CQH_rotl64(v2, 7) + CQH_rotl64(v3, 12) + CQH_rotl64(v4, 18);
		h64 = CQH64_mergeRound(h64, v1);
		h64 = CQH64_mergeRound(h64, v2);
		h64 = CQH64_mergeRound(h64, v3);
		h64 = CQH64_mergeRound(h64, v4);
	}
	else {
		h64 = state->v3 /*seed*/ + PRIME64_5;
	}

	h64 += (U64)state->total_len;

	return CQH64_finalize(h64, state->mem64, (size_t)state->total_len, CQH_aligned);
}


/*====== Canonical representation   ======*/

CQH_PUBLIC_API void CQH64_canonicalFromHash(CQH64_canonical_t* dst, CQH64_hash_t hash)
{
	CQH_STATIC_ASSERT(sizeof(CQH64_canonical_t) == sizeof(CQH64_hash_t));
	if (CQH_CPU_LITTLE_ENDIAN) hash = CQH_swap64(hash);
	memcpy(dst, &hash, sizeof(*dst));
}

CQH_PUBLIC_API CQH64_hash_t CQH64_hashFromCanonical(const CQH64_canonical_t* src)
{
	return CQH_readBE64(src);
}


/* *********************************************************************
*  CQH3
*  New generation hash designed for speed on small keys and vectorization
************************************************************************ */

#if 0
#include "cqh3.h"
#endif

#endif  /* CQH_NO_LONG_LONG */
